House Price Prediction Using a Linear Regression Model
First, we install (if they are not already installed) and load all the packages required for the linear regression analysis.
# install.packages('readr', repos = "http://cran.us.r-project.org")
# install.packages('ggplot2', repos = "http://cran.us.r-project.org")
# install.packages('mlbench', repos = "http://cran.us.r-project.org")
# install.packages('corrplot', repos = "http://cran.us.r-project.org")
# install.packages('Amelia', repos = "http://cran.us.r-project.org")
# install.packages('caret', repos = "http://cran.us.r-project.org")
# install.packages('plotly', repos = "http://cran.us.r-project.org")
# install.packages('caTools', repos = "http://cran.us.r-project.org")
# install.packages('reshape2', repos = "http://cran.us.r-project.org")
# install.packages('dplyr', repos = "http://cran.us.r-project.org")
library(readr)
library(ggplot2)
library(corrplot)
library(mlbench)
library(Amelia)
library(plotly)
library(reshape2)
library(caret)
library(caTools)
library(dplyr)
Next, we load the cleaned housing dataset.
# Load the Boston housing data shipped with mlbench. mlbench provides the
# dataset under the name `BostonHousing` (there is no `Housing` dataset in
# that package); it carries the crim/rm/chas/medv columns used below.
data("BostonHousing", package = "mlbench")
housing <- BostonHousing

# Correlation matrix of every column except `chas`, which is dropped before
# calling cor().
corrplot(cor(dplyr::select(housing, -chas)))
# Static kernel-density plot of the response variable `medv`.
ggplot(data = housing, mapping = aes(x = medv)) +
  stat_density() +
  theme_bw()
# Interactive (plotly) rendering of the same `medv` density plot.
ggplotly(
  ggplot(data = housing, mapping = aes(x = medv)) +
    stat_density() +
    theme_bw()
)
# Scatter plot of medv against each selected predictor, one facet per
# predictor. The data are reshaped to long form (one row per
# predictor/value pair) so that facet_wrap() can split on `variable`.
housing %>%
  select(crim, rm, age, rad, tax, lstat, medv, indus, nox, ptratio, zn) %>%
  melt(id.vars = "medv") %>%
  ggplot(aes(x = value, y = medv, colour = variable)) +
  geom_point(alpha = 0.7) +
  # `colour` is set as a constant outside aes(); mapping the string "black"
  # inside aes() would treat it as data, adding a "black" legend entry and
  # drawing the smoother in a default palette colour instead of black.
  stat_smooth(colour = "black") +
  facet_wrap(~variable, scales = "free", ncol = 2) +
  labs(x = "Variable Value", y = "Median House Price ($1000s)") +
  theme_minimal()
`geom_smooth()` using method = 'loess' and formula 'y ~ x'
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric, :
pseudoinverse used at -0.5
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric, :
neighborhood radius 13
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric, :
reciprocal condition number 4.5194e-15
Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric, :
There are other near singularities as well. 156.25
Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x else if (is.data.frame(newdata)) as.matrix(model.frame(delete.response(terms(object)), :
pseudoinverse used at -0.5
Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x else if (is.data.frame(newdata)) as.matrix(model.frame(delete.response(terms(object)), :
neighborhood radius 13
Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x else if (is.data.frame(newdata)) as.matrix(model.frame(delete.response(terms(object)), :
reciprocal condition number 4.5194e-15
Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x else if (is.data.frame(newdata)) as.matrix(model.frame(delete.response(terms(object)), :
There are other near singularities as well. 156.25
# Reproducible 75/25 train/test split.
set.seed(123)

# sample.split() expects the outcome vector and returns one logical per
# element. Passing the whole data frame yields one logical per *column*,
# which subset() then silently recycles across the rows, so the split is
# neither random by row nor 75/25. Splitting on `medv` gives a row mask.
# NOTE: this changes which rows land in train/test, so the fitted
# coefficients and any recorded output below will differ after re-running.
split <- sample.split(housing$medv, SplitRatio = 0.75)
train <- subset(housing, split == TRUE)
test <- subset(housing, split == FALSE)
# Fit an ordinary least-squares model of medv on six predictors using the
# training rows only, then print the coefficient table and fit statistics.
model <- lm(
  medv ~ crim + rm + tax + age + nox + lstat,
  data = train
)
summary(model)
Call:
lm(formula = medv ~ crim + rm + tax + age + nox + lstat, data = train)
Residuals:
Min 1Q Median 3Q Max
-16.9490 -3.2287 -0.9126 2.2436 29.2666
Coefficients:
Estimate Std. Error t value Pr(>|t|)
(Intercept) -4.090441 3.871134 -1.057 0.2914
crim -0.075304 0.036483 -2.064 0.0397 *
rm 5.457783 0.513535 10.628 < 2e-16 ***
tax -0.005835 0.002350 -2.483 0.0135 *
age 0.014093 0.015452 0.912 0.3624
nox 1.185187 3.840734 0.309 0.7578
lstat -0.538999 0.068160 -7.908 3.35e-14 ***
---
Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
Residual standard error: 5.206 on 355 degrees of freedom
Multiple R-squared: 0.6769, Adjusted R-squared: 0.6714
F-statistic: 124 on 6 and 355 DF, p-value: < 2.2e-16
# Collect the training residuals in a one-column data frame (column `res`)
# and draw their histogram.
res <- data.frame(res = residuals(model))
ggplot(data = res, mapping = aes(x = res)) +
  geom_histogram(fill = "blue", alpha = 0.5)
`stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Base-R diagnostic plots for the fitted linear model.
plot(model)

# Predict medv for the held-out rows and store it next to the actual values.
test$predicted.medv <- predict(model, newdata = test)

# Actual vs. predicted medv on the test set, with a smoothed trend line.
pl1 <- test %>%
  ggplot(aes(medv, predicted.medv)) +
  geom_point(alpha = 0.5) +
  # `colour` is set as a constant outside aes(); inside aes() the string
  # "black" is treated as data, which adds a legend entry and does not
  # colour the line black.
  stat_smooth(colour = "black") +
  xlab("Actual value of medv") +
  ylab("Predicted value of medv") +
  theme_bw()
ggplotly(pl1)
`geom_smooth()` using method = 'loess' and formula 'y ~ x'
# Root-mean-squared error of the test-set predictions.
error <- test$medv - test$predicted.medv
# Square each error *before* averaging. Squaring the mean instead
# (sqrt(mean(error)^2)) yields |mean(error)|, where positive and negative
# errors cancel, which is not the RMSE.
rmse <- sqrt(mean(error^2))